{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ee05a0d8-27fb-404e-8277-3da413f2c1a7",
   "metadata": {},
   "source": [
    "## 用 **pandas** 直接获取网页表格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "0f420e12-bbd9-4a36-a11a-f80d557a4f13",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 2 tables.\n",
      "        0         1        2    3    4   5\n",
      "1   专业组代号      再选科目      批次线  最高分  最低分  高出\n",
      "2  C10301        不限  435/527  572  559  32\n",
      "3  C10302        不限  435/527  579  564  37\n",
      "4  C10303        不限  435/527  582  561  34\n",
      "5  C10304       化或地  435/527  576  561  34\n",
      "6  C10305        政治  435/527  572  563  36\n",
      "7  C10306        政治  435/527  559  539  12\n",
      "8  C10307  不限（国家专项）  435/527  567  553  26\n",
      "9  C10308  不限（国家专项）  435/527  567  556  29\n",
      "--------------------------------------------------------------------------------\n",
      "         0          1        2    3    4   5\n",
      "1    专业组代号       再选科目      批次线  最高分  最低分  高出\n",
      "2   C10309         不限  409/504  573  559  55\n",
      "3   C10310         不限  409/504  567  554  50\n",
      "4   C10311         不限  409/504  565  552  48\n",
      "5   C10312         不限  409/504  571  554  50\n",
      "6   C10313         不限  409/504  585  556  52\n",
      "7   C10314         不限  409/504  574  546  42\n",
      "8   C10315         化学  409/504  569  542  38\n",
      "9   C10316        化或生  409/504  575  543  39\n",
      "10  C10317        化或地  409/504  566  552  48\n",
      "11  C10318   化学（中外机构）  409/504  535  514  10\n",
      "12  C10319   不限（中外机构）  409/504  558  528  24\n",
      "13  C10320    化（中外项目）  409/504  541  523  24\n",
      "14  C10321  化或生（中外项目）  409/504  540  512   8\n",
      "15  C10322   不限（国家专项）  409/504  555  546  42\n",
      "16  C10323   不限（国家专项）  409/504  561  545  41\n",
      "17  C10324  化或生（国家专项）  409/504  557  536  32\n",
      "18  C10325   化学（国家专项）  409/504  545  539  35\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from io import StringIO\n",
    "import requests\n",
    "\n",
    "url = 'https://zsxx.hubu.edu.cn/info/1017/1655.htm'\n",
    "\n",
    "headers = {\n",
    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'\n",
    "}\n",
    "\n",
    "# 使用自定义头部信息获取HTML内容\n",
    "response = requests.get(url, headers=headers)\n",
    "#转编码格式为utf-8，处理中文\n",
    "response.encoding = 'utf-8'  # 例如 'gb2312', 'gbk' 或 'utf-8'\n",
    "\n",
    "html_string = response.text\n",
    "# 使用StringIO对象包装HTML字符串\n",
    "string_io = StringIO(html_string)\n",
    "\n",
    "# 从StringIO对象中读取表格\n",
    "tables = pd.read_html(string_io)\n",
    "# 打印读取到的表格数量\n",
    "print(f'Found {len(tables)} tables.')\n",
    "# print(tables)\n",
    "\n",
    "\n",
    "df1 = tables[0]\n",
    "df1.drop(index=0,inplace=True)\n",
    "print(df1)\n",
    "print(10*'--------')\n",
    "df1.to_csv('data/hubu_首选科目历史.csv',index=None,header=None)\n",
    "df2=tables[1]\n",
    "df2.drop(index=0,inplace=True)\n",
    "print(df2)\n",
    "df2.to_csv('data/hubu_首选科目物理.csv',index=None,header=None)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
